Information of the project

## $file_count
## [1] 32026
## 
## $data_categories
##    file_count case_count                data_category
## 1        9971        571  Simple Nucleotide Variation
## 2        4875        584             Sequencing Reads
## 3        2731        585                  Biospecimen
## 4        1146        585                     Clinical
## 5        6070        518        Copy Number Variation
## 6        2334        519      Transcriptome Profiling
## 7        2130        579              DNA Methylation
## 8         365        365           Proteome Profiling
## 9           4          2 Somatic Structural Variation
## 10       2400        517         Structural Variation
## 
## $case_count
## [1] 585
## 
## $file_size
## [1] 2.944879e+14

Show all the columns that contain no relevant information to analyze (empty)

##  [1] "height_cm_at_diagnosis"                     
##  [2] "weight_kg_at_diagnosis"                     
##  [3] "cause_of_death"                             
##  [4] "cause_of_death_other"                       
##  [5] "tobacco_smoking_age_started"                
##  [6] "hiv_status"                                 
##  [7] "nadir_cd4_counts"                           
##  [8] "cd4_counts_at_diagnosis"                    
##  [9] "hiv_rna_load_at_diagnosis"                  
## [10] "prior_aids_conditions"                      
## [11] "hbv_test_results"                           
## [12] "hcv_test_results"                           
## [13] "hpv_test_results"                           
## [14] "kshv_hhv8_test_results"                     
## [15] "haart_therapy_prior_to_dx"                  
## [16] "haart_therapy_at_dx"                        
## [17] "cdc_hiv_risk_group"                         
## [18] "prior_mailgnancy_type"                      
## [19] "history_immunological_disease"              
## [20] "eml4_alk_translocation_variant"             
## [21] "history_immunological_disease_other"        
## [22] "history_immunosuppresive_rx"                
## [23] "history_immunosuppressive_rx_other"         
## [24] "history_relevant_infectious_dx"             
## [25] "history_immunosuppresive_dx_other"          
## [26] "laterality"                                 
## [27] "method_initial_path_dx"                     
## [28] "method_initial_path_dx_other"               
## [29] "lymph_nodes_examined"                       
## [30] "lymph_nodes_examined_count"                 
## [31] "lymph_nodes_examined_he_count"              
## [32] "lymph_nodes_examined_ihc_count"             
## [33] "pos_lymph_node_location"                    
## [34] "other_pos_node_location"                    
## [35] "ajcc_tumor_clinical_ct"                     
## [36] "ajcc_nodes_clinical_cn"                     
## [37] "ajcc_metastasis_clinical_cm"                
## [38] "ajcc_clinical_tumor_stage"                  
## [39] "followup_lost_to"                           
## [40] "cancer_diagnosis_cancer_type_icd9_text_name"
## [41] "days_to_form_completion"                    
## [42] "days_to_hiv_diagnosis"                      
## [43] "days_to_patient_progression_free"           
## [44] "days_to_sample_procurement"                 
## [45] "days_to_tumor_progression"                  
## [46] "egfr_mutation_identified"                   
## [47] "eml4_alk_translocation_identified"          
## [48] "extranodal_involvement"                     
## [49] "family_member_relationship_type"            
## [50] "margin_status"                              
## [51] "measure_of_response"                        
## [52] "metastatic_site_at_diagnosis"               
## [53] "metastatic_site_at_diagnosis_other"         
## [54] "number_cycles"                              
## [55] "pharm_regimen"                              
## [56] "pharm_regimen_other"                        
## [57] "prior_systemic_therapy_type"                
## [58] "regimen_indication"                         
## [59] "relative_family_cancer_history"             
## [60] "stage_other"                                
## [61] "stem_cell_transplantation"
## Create the individual plots of age, Vital status and gender
# Shared text styling for the three descriptive plots below: bold, size-12,
# horizontally centred axis titles, plot title and tick labels.
bold_text_theme <- theme(
  axis.title = element_text(size = 12, face = "bold", angle = 0, hjust = 0.5),
  title = element_text(size = 12, face = "bold", angle = 0, hjust = 0.5),
  axis.text = element_text(size = 12, face = "bold", angle = 0, hjust = 0.5)
)

# Histogram of the age at initial pathologic diagnosis, in 10-year bins.
plot_age <- ggplot(
  clinical_data_filter_col,
  aes(x = age_at_initial_pathologic_diagnosis)
) +
  geom_histogram(
    binwidth = 10, color = "black", fill = "#7AC5CD", alpha = 0.7
  ) +
  labs(
    title = "Age Distribution of LUAD Patients",
    x = "Age",
    y = "Abs. Frequency"
  ) +
  theme_minimal() +
  bold_text_theme

# Bar chart with the number of patients per vital status.
# The fill is mapped to the variable and coloured through a manual scale
# instead of being passed as a constant two-element vector: a constant
# `fill = c(a, b)` only works when the chart has exactly two bars and errors
# as soon as an additional category (for example NA) is present. The legend
# is hidden because the x axis already labels each bar.
plot_vital_status <- ggplot(clinical_data_filter_col, aes(x = vital_status)) +
  geom_bar(aes(fill = vital_status), color = "black", show.legend = FALSE) +
  scale_fill_manual(values = c("#66CDAA", "#CD1076")) +
  labs(
    title = "Vital Status of LUAD Patients",
    x = "Vital Status",
    y = "Total"
  ) +
  theme_minimal() +
  bold_text_theme

# Bar chart with the number of patients per gender; fill handled the same way
# as in the vital-status plot.
plot_gender <- ggplot(clinical_data_filter_col, aes(x = gender)) +
  geom_bar(aes(fill = gender), color = "black", show.legend = FALSE) +
  scale_fill_manual(values = c("#9A32CD", "#1874CD")) +
  labs(
    title = "Gender Distribution of LUAD Patients",
    x = "Gender",
    y = "Total"
  ) +
  theme_minimal() +
  bold_text_theme

# Lay the three descriptive plots out on a two-column grid and wrap the
# arranged result as a ggplot object so it can be handed to ggsave().
grid_plot <- as.ggplot(
  grid.arrange(
    plot_age,
    plot_vital_status,
    plot_gender,
    ncol = 2
  )
)

# Write the combined figure to disk; width and height are given in pixels.
ggsave(
  "./Grid_plots.png",
  plot = grid_plot,
  width = 2200,
  height = 1600,
  units = "px"
)
# Age histogram (10-year bins) with bars filled by vital status and one
# panel per gender.
ggplot(
  clinical_data_filter_col,
  aes(x = age_at_initial_pathologic_diagnosis, fill = vital_status)
) +
  geom_histogram(binwidth = 10, color = "black") +
  # One facet per gender
  facet_wrap(~ gender) +
  # Two fill colours, assigned to the vital-status values in scale order
  scale_fill_manual(values = c("#66CDAA", "#CD1076")) +
  labs(
    title = "Age Distribution Stratified by Gender and Vital Status",
    x = "Age",
    y = "Total"
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_text(
      size = 14, face = "bold", angle = 0, hjust = 0.5
    ),
    # Tilt the x tick labels by 45 degrees
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12)
  )

# Keep only the patients whose AJCC pathologic tumor stage is recorded
# (rows with NA in that column are dropped).
clinical_data_violin <- filter(
  clinical_data_filter_col,
  !is.na(ajcc_pathologic_tumor_stage)
)

# Age at diagnosis per AJCC pathologic tumor stage: a violin per stage, a
# narrow boxplot on top of it, and the individual patients as jittered points
# coloured by stage.
ggplot(
  clinical_data_violin,
  aes(
    x = ajcc_pathologic_tumor_stage,
    y = age_at_initial_pathologic_diagnosis
  )
) +
  geom_violin(fill = "lightblue", color = "black", alpha = 0.7) +
  geom_boxplot(
    width = 0.2, color = "black", alpha = 0.5, show.legend = FALSE
  ) +
  geom_jitter(
    aes(color = ajcc_pathologic_tumor_stage),
    width = 0.2, alpha = 0.6, size = 2
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    # The x tick labels are suppressed; the colour legend of the jittered
    # points identifies each stage instead.
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 12)
  ) +
  labs(
    title = "Age Distribution by AJCC Pathologic Tumor Stage",
    x = "AJCC Pathologic Tumor Stage",
    y = "Age"
  )

# Absolute frequency of each anatomic organ subdivision
data <- table(clinical_data_filter_col$anatomic_organ_subdivision)

# Two-column data frame: the region names and their counts
df <- data.frame(Region = names(data), Count = as.numeric(data))

# Interactive (plotly) pie chart of the distribution of lung regions; both the
# slice text and the hover text show the label and its percentage.
df %>%
  plot_ly(
    labels = ~Region,
    values = ~Count,
    type = "pie",
    textinfo = "label+percent",
    hoverinfo = "label+percent",
    marker = list(
      colors = c("#66CDAA", "#CD1076", "#1E90FF", "#FFD700", "#8A2BE2")
    )
  ) %>%
  layout(title = "Distribution of Lung Regions", showlegend = TRUE)

Survival analysis

Kaplan-Meier survival curves

Tumor stage (survival distributions between the groups are significantly different)

# Kaplan-Meier fit with one stratum per AJCC pathologic tumor stage
fit <- survfit(
  Surv(survival_time, vital_status) ~ ajcc_pathologic_tumor_stage,
  data = clinical_data_survival
)

# Legend labels shared by the two plots below.
# NOTE(review): these are matched to the strata by position, so they must
# follow the order of the stage levels in the data — confirm.
stage_labels <- c(
  "Discrepancy",
  "Stage I", "Stage IA", "Stage IB",
  "Stage II", "Stage IIA", "Stage IIB",
  "Stage IIIA", "Stage IIIB",
  "Stage IV"
)

# Survival curves annotated with the p-value, without risk table; the legend
# is positioned with the coordinate pair c(0.93, 0.6).
ggsurvplot(
  fit,
  data = clinical_data_survival,
  pval = TRUE,
  risk.table = FALSE,
  legend = c(0.93, 0.6),
  legend.labs = stage_labels,
  legend.title = element_blank()
)

# Same fit drawn together with the risk table.
# NOTE(review): risk.table.height = 1 presumably lets the table take the whole
# figure height — confirm this is intended.
ggsurvplot(
  fit,
  data = clinical_data_survival,
  risk.table = TRUE,
  legend.labs = stage_labels,
  legend.title = element_blank(),
  risk.table.height = 1
)

EGFR mutation (no significant difference between the survival distributions)

# Kaplan-Meier fit stratified by EGFR mutation status
fit_egfr <- survfit(
  Surv(survival_time, vital_status) ~ egfr_mutation_status,
  data = clinical_data_survival
)
# Survival curves annotated with the p-value
ggsurvplot(
  fit_egfr,
  data = clinical_data_survival,
  pval = TRUE
)

KRAS mutation (no significant difference between the survival distributions)

# Kaplan-Meier fit stratified by whether a KRAS mutation was found
fit_kras <- survfit(
  Surv(survival_time, vital_status) ~ kras_mutation_found,
  data = clinical_data_survival
)
# Survival curves annotated with the p-value
ggsurvplot(
  fit_kras,
  data = clinical_data_survival,
  pval = TRUE
)

Cox Proportional Hazards Model

# Fit a Cox proportional hazards model of survival on age at diagnosis,
# AJCC pathologic tumor stage and smoking pack-years.
cox <- coxph(
  Surv(survival_time, vital_status) ~
    age_at_initial_pathologic_diagnosis +
    ajcc_pathologic_tumor_stage +
    tobacco_smoking_pack_years_smoked,
  data = clinical_data_survival
)
# summary(cox)

# Collect, per model term, the hazard ratio, its 95% confidence interval and
# the p-value into a data frame for plotting.
summary_cox <- summary(cox)
hr_data <- data.frame(
  Variable = rownames(summary_cox$coefficients),
  HR = exp(summary_cox$coefficients[, "coef"]),
  # summary.coxph() reports `conf.int` already on the hazard-ratio scale
  # (its columns sit next to "exp(coef)"), so the bounds must NOT be
  # exponentiated again; doing so yields an interval that no longer matches HR.
  lower_ci = summary_cox$conf.int[, "lower .95"],
  upper_ci = summary_cox$conf.int[, "upper .95"],
  p_value = summary_cox$coefficients[, "Pr(>|z|)"]
)

# Keep only the terms whose p-value is below 0.05
hr_data_filtered <- hr_data %>% filter(p_value < 0.05)

# Forest plot of the retained terms: point = HR, horizontal bar = 95% CI,
# log-scaled x axis, dashed red reference line at HR = 1 (no effect).
ggplot(hr_data_filtered, aes(x = HR, y = Variable)) +
  # After the filter above this colour mapping is TRUE for every row; it is
  # kept so the point colour and legend stay as before.
  geom_point(aes(color = p_value < 0.05), size = 4) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2) +
  scale_x_log10() +
  labs(
    title = "Hazard Ratios (HR) de Supervivencia con Regresión de Cox",
    x = "Hazard Ratio (HR)",
    y = "Variables"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1)
  ) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "red")